import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
from scipy import stats
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
# Load the dataset from the CSV file and preview its first ten rows.
data_path = "D:/מבוא למדעי הנתונים/מטלה 1/df_after_missing_values_treatment.csv"
df = pd.read_csv(data_path)
df.head(n=10)
| Unnamed: 0 | age | sex | marital_status | ses | residence | weigh | heigh | BMI | bp_sys | ... | Charlson | framingham_cvd | antidiabetics | ERD | CVD | HTN | cardiovascular_meds | statines | immigrant | dead_5y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 49.0 | 2 | U | 1 | urban | 2 | 160.0 | 32.79 | 1 | ... | 1 | 3 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 |
| 1 | 1 | 46.0 | 2 | M | 1 | urban | 1 | 152.0 | 29.38 | 1 | ... | 1 | 2 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 2 | 2 | 46.0 | 2 | U | 1 | urban | 1 | 148.0 | 28.31 | 4 | ... | 1 | 3 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 |
| 3 | 3 | 45.0 | 1 | U | 2 | urban | 1 | 166.0 | 26.85 | unknown | ... | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 4 | 49.0 | 2 | U | 1 | urban | 1 | 161.0 | 20.45 | 1 | ... | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 5 | 5 | 41.0 | 1 | M | 1 | urban | 4 | 193.0 | 28.86 | 1 | ... | 1 | 4 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 6 | 6 | 46.0 | 2 | U | 2 | urban | 3 | 172.0 | 30.27 | 3 | ... | 1 | 3 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| 7 | 7 | 47.0 | 1 | M | 2 | urban | 2 | 175.0 | 27.92 | 2 | ... | 2 | 3 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 8 | 8 | 42.0 | 1 | M | 1 | urban | 3 | 175.0 | 28.41 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | 9 | 44.0 | 2 | M | 1 | urban | 1 | 164.0 | 26.77 | 4 | ... | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 rows × 42 columns
# Remove the generated index column ('Unnamed: 0'), then show the new shape.
df = df.drop(columns=['Unnamed: 0'])
df.shape
(12438, 41)
# Divide the variables into numerical and categorical groups.
num_vars = [
    'age', 'heigh', 'BMI', 'creatinin', 'albumin', 'cholesterol_total', 'LDL',
    'HDL', 'TSH', 'WBC', 'RBC', 'platelets', 'MPV',
]
# Every column that is not listed as numerical is treated as categorical.
numeric_names = set(num_vars)
cat_vars = [name for name in df.columns if name not in numeric_names]
num_vars_len, cat_vars_len = len(num_vars), len(cat_vars)
# Change the categorical variables to the pandas 'category' dtype.
df = df.astype({name: 'category' for name in cat_vars})
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12438 entries, 0 to 12437 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 12438 non-null float64 1 sex 12438 non-null category 2 marital_status 12438 non-null category 3 ses 12438 non-null category 4 residence 12438 non-null category 5 weigh 12438 non-null category 6 heigh 12438 non-null float64 7 BMI 12438 non-null float64 8 bp_sys 12438 non-null category 9 bp_dias 12438 non-null category 10 bp_cat 12438 non-null category 11 smoking_status 12438 non-null category 12 HbA1c 12438 non-null category 13 glucose 12438 non-null category 14 creatinin 12438 non-null float64 15 albumin 12438 non-null float64 16 ACR 12438 non-null category 17 cholesterol_total 12438 non-null float64 18 LDL 12438 non-null float64 19 HDL 12438 non-null float64 20 triglycerides 12438 non-null category 21 TSH 12438 non-null float64 22 gravity_u 12438 non-null category 23 nitrites_u 12438 non-null category 24 leuko_u 12438 non-null category 25 proteinuria 12438 non-null category 26 WBC 12438 non-null float64 27 RBC 12438 non-null float64 28 platelets 12438 non-null float64 29 MCV 12438 non-null category 30 MPV 12438 non-null float64 31 Charlson 12438 non-null category 32 framingham_cvd 12438 non-null category 33 antidiabetics 12438 non-null category 34 ERD 12438 non-null category 35 CVD 12438 non-null category 36 HTN 12438 non-null category 37 cardiovascular_meds 12438 non-null category 38 statines 12438 non-null category 39 immigrant 12438 non-null category 40 dead_5y 12438 non-null category dtypes: category(28), float64(13) memory usage: 1.6 MB
# NOTE(review): this binds the DataFrame class itself (not an instance) and is
# not used in the visible code -- confirm whether anything later relies on it.
describe = pd.DataFrame
# Print the inter-quartile range and summary statistics of each numerical variable.
for name in num_vars:
    values = df[name]
    upper_quartile = values.quantile(.75)
    lower_quartile = values.quantile(.25)
    print('IQR:', upper_quartile - lower_quartile)
    print(values.describe())
    print('--------------------')
IQR: 7.0 count 12438.000000 mean 43.596227 std 4.666418 min 31.000000 25% 41.000000 50% 45.000000 75% 48.000000 max 49.000000 Name: age, dtype: float64 -------------------- IQR: 15.0 count 12438.000000 mean 167.955357 std 9.925608 min 139.000000 25% 160.000000 50% 168.000000 75% 175.000000 max 197.000000 Name: heigh, dtype: float64 -------------------- IQR: 7.530000000000001 count 12438.000000 mean 31.433229 std 5.609614 min 15.790000 25% 27.400000 50% 31.050000 75% 34.930000 max 47.260000 Name: BMI, dtype: float64 -------------------- IQR: 0.22999999999999998 count 12438.000000 mean 0.717937 std 0.161310 min 0.300000 25% 0.600000 50% 0.710000 75% 0.830000 max 1.190000 Name: creatinin, dtype: float64 -------------------- IQR: 0.3099999999999996 count 12438.000000 mean 4.331609 std 0.282926 min 3.460000 25% 4.190000 50% 4.331609 75% 4.500000 max 5.160000 Name: albumin, dtype: float64 -------------------- IQR: 49.17499999999998 count 12438.000000 mean 181.609782 std 36.610662 min 79.700000 25% 156.000000 50% 180.000000 75% 205.175000 max 285.000000 Name: cholesterol_total, dtype: float64 -------------------- IQR: 41.7 count 12438.000000 mean 102.851480 std 30.728901 min 30.000000 25% 81.300000 50% 101.200000 75% 123.000000 max 189.000000 Name: LDL, dtype: float64 -------------------- IQR: 12.0 count 12438.000000 mean 41.556826 std 8.858572 min 16.000000 25% 35.000000 50% 41.000000 75% 47.000000 max 67.000000 Name: HDL, dtype: float64 -------------------- IQR: 1.299999809265099 count 12438.000000 mean 13.601263 std 0.991279 min 10.900000 25% 12.900000 50% 13.500000 75% 14.200000 max 16.400000 Name: TSH, dtype: float64 -------------------- IQR: 2.71 count 12438.000000 mean 8.132087 std 2.066043 min 2.600000 25% 6.700000 50% 8.000000 75% 9.410000 max 13.900000 Name: WBC, dtype: float64 -------------------- IQR: 0.6399999999999997 count 12438.000000 mean 4.939331 std 0.472472 min 3.600000 25% 4.610000 50% 4.939331 75% 5.250000 max 6.270000 Name: RBC, dtype: float64 
-------------------- IQR: 89.0 count 12438.000000 mean 273.260533 std 67.175918 min 85.000000 25% 226.000000 50% 269.000000 75% 315.000000 max 463.000000 Name: platelets, dtype: float64 -------------------- IQR: 1.5 count 12438.000000 mean 9.290664 std 1.155942 min 6.200000 25% 8.500000 50% 9.200000 75% 10.000000 max 12.400000 Name: MPV, dtype: float64 --------------------
# Print the frequency of every level of each categorical variable.
for name in cat_vars:
    level_counts = df[name].value_counts()
    print(level_counts)
    print('--------------------')
1 6607 2 5831 Name: sex, dtype: int64 -------------------- M 8537 U 3901 Name: marital_status, dtype: int64 -------------------- 1 6281 2 3890 3 2267 Name: ses, dtype: int64 -------------------- urban 11659 rural 779 Name: residence, dtype: int64 -------------------- 4 3151 3 3123 1 2975 2 2935 unknown 254 Name: weigh, dtype: int64 -------------------- 4 3759 1 2766 2 2697 3 2001 unknown 1215 Name: bp_sys, dtype: int64 -------------------- 4 5566 2 4110 1 1479 3 759 unknown 524 Name: bp_dias, dtype: int64 -------------------- Normal 7496 Pre-HTN 2809 HTN-G1 1691 HTN-G2 367 HTN-G3 75 Name: bp_cat, dtype: int64 -------------------- non_smoker 7806 current_smoker 3287 past_smoker 1345 Name: smoking_status, dtype: int64 -------------------- 4 3160 3 3046 1 2878 2 2845 unknown 509 Name: HbA1c, dtype: int64 -------------------- 4 2965 3 2946 2 2907 1 2860 unknown 760 Name: glucose, dtype: int64 -------------------- unknown 2877 2 2815 4 2423 3 2364 1 1959 Name: ACR, dtype: int64 -------------------- 4 2963 3 2948 1 2938 2 2932 unknown 657 Name: triglycerides, dtype: int64 -------------------- unknown 2622 3 2537 2 2526 4 2484 1 2269 Name: gravity_u, dtype: int64 -------------------- 0.0 9567 unknown 2520 1.0 328 2.0 23 Name: nitrites_u, dtype: int64 -------------------- 0.0 7919 unknown 2507 500.0 568 25.0 524 75.0 341 250.0 238 100.0 233 1.0 43 2.0 35 4.0 19 3.0 11 Name: leuko_u, dtype: int64 -------------------- 0.0 9428 unknown 2506 1.0 504 Name: proteinuria, dtype: int64 -------------------- 4 3000 3 2997 2 2940 1 2933 unknown 568 Name: MCV, dtype: int64 -------------------- 1 6167 2 3465 3 1417 4 506 0 477 5 193 6 122 7 42 10 17 8 15 9 9 12 3 14 3 11 2 Name: Charlson, dtype: int64 -------------------- 4 2961 3 2957 1 2956 2 2956 unknown 608 Name: framingham_cvd, dtype: int64 -------------------- 1 10668 0 1770 Name: antidiabetics, dtype: int64 -------------------- 0 12356 1 82 Name: ERD, dtype: int64 -------------------- 0 11293 1 1145 Name: CVD, dtype: int64 
-------------------- 0 8482 1 3956 Name: HTN, dtype: int64 -------------------- 1 7496 0 4942 Name: cardiovascular_meds, dtype: int64 -------------------- 1 8734 0 3704 Name: statines, dtype: int64 -------------------- 0 10263 1 2175 Name: immigrant, dtype: int64 -------------------- 0 12241 1 197 Name: dead_5y, dtype: int64 --------------------
# One row per numerical variable: histogram on the left, box plot on the right.
fig, ax = plt.subplots(nrows=num_vars_len, ncols=2, figsize=(15, 50))
fig.subplots_adjust(hspace=0.5)
for var, (hist_ax, box_ax) in zip(num_vars, ax):
    sns.histplot(data=df, x=var, ax=hist_ax)
    sns.boxplot(data=df, x=var, ax=box_ax)
# One count plot per categorical variable, stacked in a single column.
fig, ax = plt.subplots(nrows=cat_vars_len, figsize=(4, 70))
fig.subplots_adjust(hspace=1)
for var, count_ax in zip(cat_vars, ax):
    sns.countplot(data=df, x=var, ax=count_ax)
# Build every unordered pair of distinct numerical variables exactly once.
# combinations() walks num_vars in order, so the result is deterministic.
# De-duplicating through set() is avoided on purpose: the iteration order of a
# set of strings is not stable, so the two orderings of the same pair may not
# collapse to the same list.
from itertools import combinations

pairs = [list(pair) for pair in combinations(num_vars, 2)]
len(pairs)
78
# Scatter plot for every pair of numerical variables. Each title shows the
# Spearman correlation and its p-value; the title background is red when
# |corr| >= 0.7, otherwise yellow when p < 0.05.
n_cols = 3
# Ceiling division so the grid always has room for every pair
# (78 pairs -> 26 rows); at least one row so plt.subplots stays valid.
n_rows = max(1, (len(pairs) + n_cols - 1) // n_cols)
# Keep the original proportions: 26 rows were drawn with a figure height of 200.
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_rows, n_cols, figsize = (25, n_rows * 200 / 26), squeeze=False)
plt.subplots_adjust(hspace=0.5)
for i, [var1, var2] in enumerate(pairs):
    cell = ax[i // n_cols, i % n_cols]
    corr, p = spearmanr(df[var1], df[var2], nan_policy='omit')
    title = 'corr: ' + str(round(corr,3)) + ' , ' + 'p value: ' + str(round(p,3))
    sns.scatterplot(data = df, x = var1, y = var2, ax = cell)
    cell.set_title(title, color = 'blue')
    if abs(corr) >= 0.7:
        cell.set_title(title, backgroundcolor='red')
    elif p < 0.05:
        cell.set_title(title, backgroundcolor='yellow')
# Hide grid cells left empty when the number of pairs is not a multiple of n_cols.
for unused_ax in ax.flat[len(pairs):]:
    unused_ax.set_visible(False)